{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "@curiosity\n",
    "- 运行环境\n",
    "    - 系统版本\n",
    "        - win10\n",
    "    - 包版本\n",
    "        - numpy\n",
    "        - pandas\n",
    "        - sklearn\n",
    "        - lightgbm\n",
    "- 特征工程思路\n",
    "    - 构造新特征。例如：有效贷款总数、主账户违约比率、二级账户违约比率、总违约比率、总未还贷款金额比率等。\n",
    "    - 数值特征的统计特征。按各类别特征分组，计算credit_score的均值（mean）作为统计特征。\n",
    "    - 尝试对age/loan_to_asset_ratio_bin进行较粗粒度的序数编码，希望增强泛化能力（但好像没什么用）。\n",
    "    - 目标编码（target encoding）。对employee_code_id/supplier_id/branch_id等类别特征做目标编码。\n",
    "    - 计数编码（count encoding）。对employee_code_id/supplier_id/branch_id等类别特征做计数编码。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:11.943857Z",
     "start_time": "2021-07-22T09:54:10.046743Z"
    }
   },
   "outputs": [],
   "source": [
    "## 导入第三方包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import lightgbm as lgb\n",
    "# import xgboost as xgb\n",
    "# import catboost as cat\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import f1_score, roc_auc_score\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tqdm import tqdm\n",
    "import gc\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "pd.set_option('display.max_columns', 200)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "\n",
    "# NOTE(review): absolute, machine-specific paths; point these at the local data directory before running.\n",
    "train_path = '/Users/dingchaoshun/python-file/iFLYTEK_CarLoan/data/train.csv'\n",
    "test_path = '/Users/dingchaoshun/python-file/iFLYTEK_CarLoan/data/test.csv'\n",
    "\n",
    "# Read the datasets (see the competition manual for how to download them).\n",
    "train = pd.read_csv(train_path)\n",
    "test = pd.read_csv(test_path)\n",
    "\n",
    "# Swap the names of the two inactive-loan-count columns; the same mapping is applied to both sets.\n",
    "inactive_col_swap = {\n",
    "    'sub_account_inactive_loan_no': 'total_inactive_loan_no',\n",
    "    'total_inactive_loan_no': 'sub_account_inactive_loan_no',\n",
    "}\n",
    "train = train.rename(columns=inactive_col_swap)\n",
    "test = test.rename(columns=inactive_col_swap)\n",
    "target = 'loan_default'\n",
    "\n",
    "# Stack train and test so the features below are computed over both.\n",
    "# DataFrame.append was removed in pandas 2.0; pd.concat produces the same frame.\n",
    "df_feature = pd.concat([train, test], sort=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# feature Engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## create_feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:11.959832Z",
     "start_time": "2021-07-22T09:54:11.945855Z"
    }
   },
   "outputs": [],
   "source": [
    "# create feature\n",
    "def create_feature(df):\n",
    "    df['total_active_loan_no'] = df['main_account_active_loan_no'] + df['sub_account_active_loan_no']\n",
    "    df['total_inactive_loan_ratio'] = df['total_inactive_loan_no'] / (df['total_account_loan_no'] + 1e-5)\n",
    "    df['main_overdue_ratio'] = df['main_account_overdue_no'] / (df['main_account_active_loan_no'] + 1e-5)\n",
    "    df['sub_overdue_ratio'] = df['sub_account_overdue_no'] / (df['sub_account_active_loan_no'] + 1e-5)\n",
    "    df['total_overdue_ratio'] = df['total_overdue_no'] / (df['total_active_loan_no'] + 1e-5)\n",
    "    df['total_outstanding_ratio'] = df['total_outstanding_loan'] / (df['total_sanction_loan'] + 1e-5)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## make-ratio-bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## make-ratio-bins\n",
    "def make_data_ratio_bin(df):\n",
    "    df['loan_to_asset_ratio_bin'] = pd.qcut(df['loan_to_asset_ratio'], 100)\n",
    "    df['loan_to_asset_ratio_bin'] = df['loan_to_asset_ratio_bin'].cat.codes\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## make_age_bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:11.991014Z",
     "start_time": "2021-07-22T09:54:11.976832Z"
    }
   },
   "outputs": [],
   "source": [
    "# ## cut bins\n",
    "def make_age_bins(x):\n",
    "    if x <= 22: age_bin = 1\n",
    "    elif x > 22 and x <= 35: age_bin = 2 \n",
    "    elif x > 35 and x <= 45: age_bin = 3\n",
    "    elif x > 45 and x <= 60: age_bin = 4\n",
    "    else: age_bin = 5\n",
    "    return age_bin\n",
    "def make_data_age_bin(df):\n",
    "    \"\"\"Add an ``age_bin`` column by applying ``make_age_bins`` to each ``age`` value.\n",
    "\n",
    "    Mutates ``df`` in place and returns it.\n",
    "    \"\"\"\n",
    "    age_bins = df['age'].apply(make_age_bins)\n",
    "    df['age_bin'] = age_bins\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## make_stats_feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:12.022958Z",
     "start_time": "2021-07-22T09:54:12.008891Z"
    }
   },
   "outputs": [],
   "source": [
    "# statistic feature\n",
    "# credit_score 的统计特征\n",
    "# label_cols = ['credit_score','loan_to_asset_ratio','disbursed_amount']\n",
    "label_cols = ['credit_score']\n",
    "category_cols = ['branch_id','supplier_id','manufacturer_id','area_id','employee_code_id','employment_type','Credit_level']\n",
    "group_cols = category_cols\n",
    "# make_stats_feature\n",
    "def make_stats_feature(df, label_cols, group_cols):\n",
    "    for label in label_cols:\n",
    "        for group in group_cols:\n",
    "            df[f'{label}_by_{group}_mean'] = df.groupby(group)[label].transform('mean')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## count encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:12.038185Z",
     "start_time": "2021-07-22T09:54:12.023849Z"
    }
   },
   "outputs": [],
   "source": [
    "## count encoding\n",
    "count_cols = ['branch_id','supplier_id','manufacturer_id','area_id','employee_code_id','employment_type','Credit_level']\n",
    "def count_encoding(df, count_cols):\n",
    "    \"\"\"Add a ``<col>_count`` column giving how often each row's value occurs in ``col``.\n",
    "\n",
    "    Counts are taken over the whole of ``df``. Mutates ``df`` in place and returns it.\n",
    "    \"\"\"\n",
    "    for col in tqdm(count_cols):\n",
    "        frequencies = df[col].value_counts()\n",
    "        df[f'{col}_count'] = df[col].map(frequencies)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:13.798021Z",
     "start_time": "2021-07-22T09:54:12.039182Z"
    }
   },
   "outputs": [],
   "source": [
    "## process_data\n",
    "def process_data(df):\n",
    "    \"\"\"Run the feature steps in order: derived ratios, age bins, group means, count encoding.\n",
    "\n",
    "    Uses the module-level ``label_cols``, ``group_cols`` and ``count_cols`` lists.\n",
    "    \"\"\"\n",
    "    # Steps that take only the frame.\n",
    "    for step in (create_feature, make_data_age_bin):\n",
    "        df = step(df)\n",
    "    # Steps that also need a column configuration.\n",
    "    df = make_stats_feature(df, label_cols, group_cols)\n",
    "    df = count_encoding(df, count_cols)\n",
    "    return df\n",
    "df_feature = process_data(df_feature)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5-fold target encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:22.195068Z",
     "start_time": "2021-07-22T09:54:13.800018Z"
    }
   },
   "outputs": [],
   "source": [
    "## Groupby statistic feature (mean\\std\\min\\max\\median)\n",
    "target_encode_cols = ['branch_id','supplier_id','manufacturer_id','area_id','employee_code_id','employment_type','Credit_level']\n",
    "\n",
    "def stat(df, df_merge, group_by, agg):\n",
    "    group = df.groupby(group_by).agg(agg)\n",
    "\n",
    "    columns = []\n",
    "    for on, methods in agg.items():\n",
    "        for method in methods:\n",
    "            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))\n",
    "    group.columns = columns\n",
    "    group.reset_index(inplace=True)\n",
    "    df_merge = df_merge.merge(group, on=group_by, how='left')\n",
    "\n",
    "    del (group)\n",
    "    gc.collect()\n",
    "\n",
    "    return df_merge\n",
    "\n",
    "def statis_feat(df_know, df_unknow):\n",
    "    \"\"\"Target-encode ``df_unknow`` using target statistics computed on ``df_know``.\n",
    "\n",
    "    For each column in ``target_encode_cols`` the mean and std of ``target``\n",
    "    per category value are computed on ``df_know`` and merged onto ``df_unknow``.\n",
    "    \"\"\"\n",
    "    target_stats = {target: ['mean', 'std']}\n",
    "    for f in tqdm(target_encode_cols):\n",
    "        df_unknow = stat(df_know, df_unknow, [f], target_stats)\n",
    "    return df_unknow\n",
    "\n",
    "## 5折交叉 target encoding\n",
    "# Labelled rows are the training set; rows without a target are the test set.\n",
    "train = df_feature[~df_feature[target].isnull()]\n",
    "train = train.reset_index(drop=True)\n",
    "test = df_feature[df_feature[target].isnull()]\n",
    "\n",
    "# Out-of-fold encoding: each validation fold is encoded with statistics from the\n",
    "# other folds only, so a row never sees its own target value.\n",
    "kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)\n",
    "fold_frames = []\n",
    "for tra_index, val_index in kfold.split(train, train[target]):\n",
    "    df_fold_train = train.iloc[tra_index]\n",
    "    df_fold_val = train.iloc[val_index]\n",
    "    fold_frames.append(statis_feat(df_fold_train, df_fold_val))\n",
    "\n",
    "    del df_fold_train\n",
    "    del df_fold_val\n",
    "    gc.collect()\n",
    "\n",
    "# The test set is encoded with statistics from the full training set.\n",
    "test = statis_feat(train, test)\n",
    "\n",
    "# Concatenate once instead of growing a frame inside the loop.\n",
    "df_feature = pd.concat(fold_frames + [test], axis=0)\n",
    "\n",
    "del fold_frames\n",
    "del train\n",
    "del test\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## split data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T09:54:22.562088Z",
     "start_time": "2021-07-22T09:54:22.198064Z"
    }
   },
   "outputs": [],
   "source": [
    "## 训练数据及测试数据准备\n",
    "# Rows with a target value form the training set; the rest are the test set.\n",
    "has_label = df_feature[target].notnull()\n",
    "train = df_feature[has_label]\n",
    "test = df_feature[~has_label]\n",
    "\n",
    "# Columns excluded from the model inputs (identifier, target and the listed flag/date columns).\n",
    "useless_cols = ['customer_id','loan_default','mobileno_flag','idcard_flag','disbursed_date']\n",
    "all_cols = [name for name in train.columns if name not in useless_cols]\n",
    "x_train, x_test = train[all_cols], test[all_cols]\n",
    "y_train = train[target]\n",
    "\n",
    "print(x_train.shape, x_test.shape, sep='\\n')\n",
    "x_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LGB Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T10:05:49.023903Z",
     "start_time": "2021-07-22T09:54:22.564109Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "## 作为baseline部分仅使用经典的LightGBM作为训练模型，我们还能尝试XGBoost、CatBoost和NN（神经网络）\n",
    "def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):\n",
    "    \"\"\"Train one booster per stratified fold and average the test predictions.\n",
    "\n",
    "    Args:\n",
    "        clf: module-like object exposing ``Dataset`` and ``train`` (the ``lgb`` module in the call below).\n",
    "        train_x: training features; rows are selected with ``.iloc``.\n",
    "        train_y: training labels; rows are selected with ``.iloc``.\n",
    "        test_x: features to predict for.\n",
    "        clf_name: prefix used in the printed score summary.\n",
    "\n",
    "    Returns:\n",
    "        ``(train_pred_lst, test_pred_lst, mean_auc)``: out-of-fold predictions for\n",
    "        ``train_x``, fold-averaged predictions for ``test_x``, and the mean\n",
    "        validation AUC rounded to 6 decimals.\n",
    "    \"\"\"\n",
    "    folds = 10\n",
    "    seed = 2021\n",
    "    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)\n",
    "\n",
    "    # The parameters are identical for every fold, so they are built once.\n",
    "    params = {\n",
    "        'boosting_type': 'gbdt',\n",
    "        'objective': 'binary',\n",
    "        'metric': 'auc',\n",
    "        'min_child_weight': 5,\n",
    "        'num_leaves': 2 ** 7,\n",
    "        'lambda_l2': 10,\n",
    "        'feature_fraction': 0.9,\n",
    "        'bagging_fraction': 0.9,\n",
    "        'bagging_freq': 4,\n",
    "        'learning_rate': 0.01,\n",
    "        'seed': seed,\n",
    "        'n_jobs': -1,\n",
    "        'silent': True,\n",
    "        'verbose': -1,\n",
    "    }\n",
    "\n",
    "    train_pred_lst = np.zeros(train_x.shape[0])\n",
    "    test_pred_lst = np.zeros(test_x.shape[0])\n",
    "\n",
    "    cv_scores = []\n",
    "\n",
    "    for i, (train_index, valid_index) in enumerate(kfold.split(train_x, train_y)):\n",
    "        print('************************************ {} ************************************'.format(str(i+1)))\n",
    "        trn_x, trn_y = train_x.iloc[train_index], train_y.iloc[train_index]\n",
    "        val_x, val_y = train_x.iloc[valid_index], train_y.iloc[valid_index]\n",
    "\n",
    "        train_matrix = clf.Dataset(trn_x, label=trn_y)\n",
    "        valid_matrix = clf.Dataset(val_x, label=val_y)\n",
    "\n",
    "        # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keyword arguments were removed in\n",
    "        # LightGBM 4.0; on newer versions pass callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)].\n",
    "        model = clf.train(params, train_matrix, 10000, valid_sets=[train_matrix, valid_matrix], verbose_eval=500,early_stopping_rounds=200)\n",
    "        val_pred = model.predict(val_x, num_iteration=model.best_iteration)\n",
    "        test_pred = model.predict(test_x, num_iteration=model.best_iteration)\n",
    "        # Out-of-fold predictions fill the validation rows; test predictions are averaged over folds.\n",
    "        train_pred_lst[valid_index] = val_pred\n",
    "        test_pred_lst += test_pred / kfold.n_splits\n",
    "        cv_scores.append(roc_auc_score(val_y, val_pred))\n",
    "\n",
    "        print(cv_scores)\n",
    "\n",
    "    mean_auc = round(np.mean(cv_scores), 6)\n",
    "    print(\"%s_score_list:\" % clf_name, cv_scores)\n",
    "    print(\"%s_score_mean:\" % clf_name, np.mean(cv_scores))\n",
    "    print(\"%s_score_std:\" % clf_name, np.std(cv_scores))\n",
    "    return train_pred_lst, test_pred_lst, mean_auc\n",
    "\n",
    "lgb_train, lgb_test, lgb_score = cv_model(lgb, x_train, y_train, x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-07-22T10:05:49.101434Z",
     "start_time": "2021-07-22T10:05:49.025899Z"
    }
   },
   "outputs": [],
   "source": [
    "## 预测结果\n",
    "# Copy explicitly so the column assignment below does not write into a slice of `test`.\n",
    "submit = test[['customer_id']].copy()\n",
    "# Predicted probabilities above the threshold are labelled 1 (default), the rest 0.\n",
    "threshold = 0.25\n",
    "submit['loan_default'] = (lgb_test > threshold).astype(int)\n",
    "submit.to_csv(f'mac_submit_{lgb_score}.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "349.091px"
   },
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
